#' Original outlierKD function by By Klodian Dhana,
#' https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
#' Modified to have third argument for removing outliers instead of interactive prompt,
#' and after removing outlier, original df will not be changed. The function returns the a df,
#' which can be saved as original df name if desired.
#' Also added QQ-plot in the output, with options to show/hide boxplot, histogram, qqplot.
#' Check outliers, and option to remove them, save as a new dataframe.
#' @param df The dataframe.
#' @param var The variable in the dataframe to be checked for outliers
#' @param rm Boolean. Whether to remove outliers or not.
#' @param boxplt Boolean. Whether to show the boxplot, before and after outliers removed.
#' @param histogram Boolean. Whether to show the histogram, before and after outliers removed.
#' @param qqplt Boolean. Whether to show the qqplot, before and after outliers removed.
#' @return The dataframe with outliers replaced by NA if rm==TRUE, or df if nothing changed
#' @examples
#'   outlierKD2(mydf, height, FALSE, TRUE, TRUE, TRUE)
#'   mydf = outlierKD2(mydf, height, TRUE, TRUE, TRUE, TRUE)
#'   mydfnew = outlierKD2(mydf, height, TRUE)
#' @export
outlierKD2 <- function(df, var, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE) {
  dt = df # duplicate the dataframe for potential alteration
  var_name <- eval(substitute(var),eval(dt))
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = T)
  colTotal <- boxplt+histogram+qqplt
  par(mfrow=c(2, max(2,colTotal)), oma=c(0,0,3,0)) # fixed issue with only 0 or 1 chart selected
  if (qqplt) {
    qqnorm(var_name, main = "With outliers")
    qqline(var_name)
  }
  if (histogram) { hist(var_name, main="With outliers", xlab=NA, ylab=NA) }
  if (boxplt) { boxplot(var_name, main="With outliers") }

  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name <- ifelse(var_name %in% outlier, NA, var_name)
  if (qqplt) {
    qqnorm(var_name, main = "Without outliers")
    qqline(var_name)
  }
  if (histogram) { hist(var_name, main="Without outliers", xlab=NA, ylab=NA) }
  if (boxplt) { boxplot(var_name, main="Without outliers") }
  
  if(colTotal > 0) {  # if no charts are wanted, skip this section
    title("Outlier Check", outer=TRUE)
    na2 <- sum(is.na(var_name))
    cat("Outliers identified:", na2 - na1, "\n")
    cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
    cat("Mean of the outliers:", round(mo, 2), "\n")
    m2 <- mean(var_name, na.rm = T)
    cat("Mean without removing outliers:", round(m1, 2), "\n")
    cat("Mean if we remove outliers:", round(m2, 2), "\n")
  }

  # response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
  # if(response == "y" | response == "yes"){
  if(rm){
      dt[as.character(substitute(var))] <- invisible(var_name)
      #assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
      cat("Outliers successfully removed", "\n")
      return(invisible(dt))
  } else {
      cat("Nothing changed", "\n")
      return(invisible(df))
  }
}

Research Topic:

There is considerable evidence indicating lending disparities throughout the United States. (Steil et. al: 2018). For our research topic, we will explore lending practices in one of the fastest appreciating real estate markets over the past thirty years - the state of California. Specifically, we aim to look at the factors that are associated with denials for non-commercial mortgage loans.

Our SMART question is:

“Which factors drove denials for mortgages in California in 2019?”

To answer this question, we are using the Federal Financial Institutions Examination Council’s (FFIEC) Home Mortgage Disclosure Act (HMDA) dataset from 2019, located here: https://ffiec.cfpb.gov/data-publication/dynamic-national-loan-level-dataset/2019. We are focusing on a subset of data of 10,000 observations from 2019 that we will further filter on California leaving us with 5,196 observations.

Our Github repository address is:https://github.com/brandonchin19/Team3/.

Load in dataset

hmda_ca <- data.frame(read.csv("hmda_ca_new.csv"))
str(hmda_ca)
## 'data.frame':    50000 obs. of  99 variables:
##  $ activity_year                           : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ lei                                     : chr  "549300HN58ONH5KNJJ12" "549300ZPP46PTHQ2RF22" "549300ZPP46PTHQ2RF22" "549300ZPP46PTHQ2RF22" ...
##  $ derived_msa.md                          : int  31084 42220 42220 42220 42220 42220 42220 42220 42220 39820 ...
##  $ state_code                              : chr  "CA" "CA" "CA" "CA" ...
##  $ county_code                             : int  6037 6097 6097 6097 6097 6097 6097 6097 6097 6089 ...
##  $ census_tract                            : num  6.04e+09 6.10e+09 6.10e+09 6.10e+09 6.10e+09 ...
##  $ conforming_loan_limit                   : chr  "NC" "NC" "C" "C" ...
##  $ derived_loan_product_type               : chr  "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" ...
##  $ derived_dwelling_category               : chr  "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" ...
##  $ derived_ethnicity                       : chr  "Not Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" ...
##  $ derived_race                            : chr  "White" "White" "White" "Race Not Available" ...
##  $ derived_sex                             : chr  "Male" "Male" "Male" "Female" ...
##  $ action_taken                            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ purchaser_type                          : int  9 71 71 71 71 71 71 71 71 71 ...
##  $ preapproval                             : int  1 2 2 2 2 2 2 2 2 2 ...
##  $ loan_type                               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ loan_purpose                            : int  1 32 1 1 32 32 1 31 32 31 ...
##  $ lien_status                             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ reverse_mortgage                        : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ open.end_line_of_credit                 : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ business_or_commercial_purpose          : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ loan_amount                             : int  1125000 875000 375000 605000 1335000 405000 105000 265000 355000 185000 ...
##  $ loan_to_value_ratio                     : chr  "80" "Exempt" "Exempt" "Exempt" ...
##  $ interest_rate                           : chr  "5.25" "Exempt" "Exempt" "Exempt" ...
##  $ rate_spread                             : chr  NA "Exempt" "Exempt" "Exempt" ...
##  $ hoepa_status                            : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ total_loan_costs                        : chr  "16765" "Exempt" "Exempt" "Exempt" ...
##  $ total_points_and_fees                   : chr  NA "Exempt" "Exempt" "Exempt" ...
##  $ origination_charges                     : chr  "11200" "Exempt" "Exempt" "Exempt" ...
##  $ discount_points                         : chr  NA "Exempt" "Exempt" "Exempt" ...
##  $ lender_credits                          : chr  NA "Exempt" "Exempt" "Exempt" ...
##  $ loan_term                               : chr  "360" "Exempt" "Exempt" "Exempt" ...
##  $ prepayment_penalty_term                 : chr  NA "Exempt" "Exempt" "Exempt" ...
##  $ intro_rate_period                       : chr  NA "Exempt" "Exempt" "Exempt" ...
##  $ negative_amortization                   : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ interest_only_payment                   : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ balloon_payment                         : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ other_nonamortizing_features            : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ property_value                          : chr  "1405000" "Exempt" "Exempt" "Exempt" ...
##  $ construction_method                     : int  1 1 1 1 1 1 2 1 1 1 ...
##  $ occupancy_type                          : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ manufactured_home_secured_property_type : int  3 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ manufactured_home_land_property_interest: int  5 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ total_units                             : chr  "1" "1" "1" "1" ...
##  $ multifamily_affordable_units            : chr  NA "Exempt" "Exempt" "Exempt" ...
##  $ income                                  : int  432 254 109 134 363 140 56 60 61 355 ...
##  $ debt_to_income_ratio                    : chr  "20%-<30%" "Exempt" "Exempt" "Exempt" ...
##  $ applicant_credit_score_type             : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ co.applicant_credit_score_type          : int  10 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ applicant_ethnicity.1                   : int  2 2 2 2 2 1 1 1 2 2 ...
##  $ applicant_ethnicity.2                   : int  NA NA NA NA NA 2 NA NA NA NA ...
##  $ applicant_ethnicity.3                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_ethnicity.4                   : logi  NA NA NA NA NA NA ...
##  $ applicant_ethnicity.5                   : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.1                : int  5 5 5 5 2 1 1 5 5 2 ...
##  $ co.applicant_ethnicity.2                : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.3                : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.4                : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.5                : logi  NA NA NA NA NA NA ...
##  $ applicant_ethnicity_observed            : int  2 1 3 3 3 3 2 3 3 3 ...
##  $ co.applicant_ethnicity_observed         : int  4 4 4 4 3 3 2 4 4 3 ...
##  $ applicant_race.1                        : int  5 5 5 6 5 6 5 5 5 5 ...
##  $ applicant_race.2                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.3                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.4                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.5                        : logi  NA NA NA NA NA NA ...
##  $ co.applicant_race.1                     : int  8 8 8 8 5 6 5 8 8 5 ...
##  $ co.applicant_race.2                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ co.applicant_race.3                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ co.applicant_race.4                     : logi  NA NA NA NA NA NA ...
##  $ co.applicant_race.5                     : logi  NA NA NA NA NA NA ...
##  $ applicant_race_observed                 : int  2 1 3 3 3 3 2 3 3 3 ...
##  $ co.applicant_race_observed              : int  4 4 4 4 3 3 2 4 4 3 ...
##  $ applicant_sex                           : int  1 1 1 2 1 1 2 2 2 1 ...
##  $ co.applicant_sex                        : int  5 5 5 5 1 2 1 5 5 2 ...
##  $ applicant_sex_observed                  : int  2 1 3 3 3 3 2 3 3 3 ...
##  $ co.applicant_sex_observed               : int  4 4 4 4 3 3 2 4 4 3 ...
##  $ applicant_age                           : chr  "35-44" "65-74" "35-44" "25-34" ...
##  $ co.applicant_age                        : chr  "9999" "9999" "9999" "9999" ...
##  $ applicant_age_above_62                  : chr  "No" "Yes" "No" "No" ...
##  $ co.applicant_age_above_62               : chr  NA NA NA NA ...
##  $ submission_of_application               : int  1 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ initially_payable_to_institution        : int  2 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ aus.1                                   : int  6 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ aus.2                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.3                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.4                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.5                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.1                         : int  10 1111 1111 1111 1111 1111 1111 1111 1111 1111 ...
##  $ denial_reason.2                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.3                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.4                         : logi  NA NA NA NA NA NA ...
##  $ tract_population                        : int  4162 4094 12736 5746 4286 12736 3272 4309 3932 1581 ...
##  $ tract_minority_population_percent       : num  16.2 20.1 58.6 33.8 14.6 ...
##  $ ffiec_msa_md_median_family_income       : int  73100 93300 93300 93300 93300 93300 93300 93300 93300 61900 ...
##  $ tract_to_msa_income_percentage          : int  249 124 79 118 94 79 101 122 76 90 ...
##  $ tract_owner_occupied_units              : int  1369 1211 1960 1430 1435 1960 790 1249 1048 516 ...
##  $ tract_one_to_four_family_homes          : int  1635 1801 3422 2173 2459 3422 1235 1562 1884 1094 ...
##  $ tract_median_age_of_housing_units       : int  56 40 26 29 57 26 33 29 54 41 ...

Check number of rows

dim(hmda_ca)
## [1] 50000    99

Check head and tail of dataframe

xkabledplyhead(hmda_ca,5)
Head
activity_year lei derived_msa.md state_code county_code census_tract conforming_loan_limit derived_loan_product_type derived_dwelling_category derived_ethnicity derived_race derived_sex action_taken purchaser_type preapproval loan_type loan_purpose lien_status reverse_mortgage open.end_line_of_credit business_or_commercial_purpose loan_amount loan_to_value_ratio interest_rate rate_spread hoepa_status total_loan_costs total_points_and_fees origination_charges discount_points lender_credits loan_term prepayment_penalty_term intro_rate_period negative_amortization interest_only_payment balloon_payment other_nonamortizing_features property_value construction_method occupancy_type manufactured_home_secured_property_type manufactured_home_land_property_interest total_units multifamily_affordable_units income debt_to_income_ratio applicant_credit_score_type co.applicant_credit_score_type applicant_ethnicity.1 applicant_ethnicity.2 applicant_ethnicity.3 applicant_ethnicity.4 applicant_ethnicity.5 co.applicant_ethnicity.1 co.applicant_ethnicity.2 co.applicant_ethnicity.3 co.applicant_ethnicity.4 co.applicant_ethnicity.5 applicant_ethnicity_observed co.applicant_ethnicity_observed applicant_race.1 applicant_race.2 applicant_race.3 applicant_race.4 applicant_race.5 co.applicant_race.1 co.applicant_race.2 co.applicant_race.3 co.applicant_race.4 co.applicant_race.5 applicant_race_observed co.applicant_race_observed applicant_sex co.applicant_sex applicant_sex_observed co.applicant_sex_observed applicant_age co.applicant_age applicant_age_above_62 co.applicant_age_above_62 submission_of_application initially_payable_to_institution aus.1 aus.2 aus.3 aus.4 aus.5 denial_reason.1 denial_reason.2 denial_reason.3 denial_reason.4 tract_population tract_minority_population_percent ffiec_msa_md_median_family_income tract_to_msa_income_percentage tract_owner_occupied_units tract_one_to_four_family_homes tract_median_age_of_housing_units
2019 549300HN58ONH5KNJJ12 31084 CA 6037 6.04e+09 NC Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino White Male 1 9 1 1 1 1 2 2 2 1125000 80 5.25 NA 2 16765 NA 11200 NA NA 360 NA NA 2 2 2 2 1405000 1 1 3 5 1 NA 432 20%-<30% 2 10 2 NA NA NA NA 5 NA NA NA NA 2 4 5 NA NA NA NA 8 NA NA NA NA 2 4 1 5 2 4 35-44 9999 No NA 1 2 6 NA NA NA NA 10 NA NA NA 4162 16.2 73100 249 1369 1635 56
2019 549300ZPP46PTHQ2RF22 42220 CA 6097 6.10e+09 NC Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino White Male 1 71 2 1 32 1 1111 1111 1111 875000 Exempt Exempt Exempt 2 Exempt Exempt Exempt Exempt Exempt Exempt Exempt Exempt 1111 1111 1111 1111 Exempt 1 1 1111 1111 1 Exempt 254 Exempt 1111 1111 2 NA NA NA NA 5 NA NA NA NA 1 4 5 NA NA NA NA 8 NA NA NA NA 1 4 1 5 1 4 65-74 9999 Yes NA 1111 1111 1111 NA NA NA NA 1111 NA NA NA 4094 20.1 93300 124 1211 1801 40
2019 549300ZPP46PTHQ2RF22 42220 CA 6097 6.10e+09 C Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino White Male 1 71 2 1 1 1 1111 1111 1111 375000 Exempt Exempt Exempt 2 Exempt Exempt Exempt Exempt Exempt Exempt Exempt Exempt 1111 1111 1111 1111 Exempt 1 1 1111 1111 1 Exempt 109 Exempt 1111 1111 2 NA NA NA NA 5 NA NA NA NA 3 4 5 NA NA NA NA 8 NA NA NA NA 3 4 1 5 3 4 35-44 9999 No NA 1111 1111 1111 NA NA NA NA 1111 NA NA NA 12736 58.6 93300 79 1960 3422 26
2019 549300ZPP46PTHQ2RF22 42220 CA 6097 6.10e+09 C Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino Race Not Available Female 1 71 2 1 1 1 1111 1111 1111 605000 Exempt Exempt Exempt 2 Exempt Exempt Exempt Exempt Exempt Exempt Exempt Exempt 1111 1111 1111 1111 Exempt 1 1 1111 1111 1 Exempt 134 Exempt 1111 1111 2 NA NA NA NA 5 NA NA NA NA 3 4 6 NA NA NA NA 8 NA NA NA NA 3 4 2 5 3 4 25-34 9999 No NA 1111 1111 1111 NA NA NA NA 1111 NA NA NA 5746 33.8 93300 118 1430 2173 29
2019 549300ZPP46PTHQ2RF22 42220 CA 6097 6.10e+09 NC Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino White Male 1 71 2 1 32 1 1111 1111 1111 1335000 Exempt Exempt Exempt 2 Exempt Exempt Exempt Exempt Exempt Exempt Exempt Exempt 1111 1111 1111 1111 Exempt 1 1 1111 1111 1 Exempt 363 Exempt 1111 1111 2 NA NA NA NA 2 NA NA NA NA 3 3 5 NA NA NA NA 5 NA NA NA NA 3 3 1 1 3 3 55-64 55-64 No No 1111 1111 1111 NA NA NA NA 1111 NA NA NA 4286 14.6 93300 94 1435 2459 57
xkabledplytail(hmda_ca,5)
Tail
activity_year lei derived_msa.md state_code county_code census_tract conforming_loan_limit derived_loan_product_type derived_dwelling_category derived_ethnicity derived_race derived_sex action_taken purchaser_type preapproval loan_type loan_purpose lien_status reverse_mortgage open.end_line_of_credit business_or_commercial_purpose loan_amount loan_to_value_ratio interest_rate rate_spread hoepa_status total_loan_costs total_points_and_fees origination_charges discount_points lender_credits loan_term prepayment_penalty_term intro_rate_period negative_amortization interest_only_payment balloon_payment other_nonamortizing_features property_value construction_method occupancy_type manufactured_home_secured_property_type manufactured_home_land_property_interest total_units multifamily_affordable_units income debt_to_income_ratio applicant_credit_score_type co.applicant_credit_score_type applicant_ethnicity.1 applicant_ethnicity.2 applicant_ethnicity.3 applicant_ethnicity.4 applicant_ethnicity.5 co.applicant_ethnicity.1 co.applicant_ethnicity.2 co.applicant_ethnicity.3 co.applicant_ethnicity.4 co.applicant_ethnicity.5 applicant_ethnicity_observed co.applicant_ethnicity_observed applicant_race.1 applicant_race.2 applicant_race.3 applicant_race.4 applicant_race.5 co.applicant_race.1 co.applicant_race.2 co.applicant_race.3 co.applicant_race.4 co.applicant_race.5 applicant_race_observed co.applicant_race_observed applicant_sex co.applicant_sex applicant_sex_observed co.applicant_sex_observed applicant_age co.applicant_age applicant_age_above_62 co.applicant_age_above_62 submission_of_application initially_payable_to_institution aus.1 aus.2 aus.3 aus.4 aus.5 denial_reason.1 denial_reason.2 denial_reason.3 denial_reason.4 tract_population tract_minority_population_percent ffiec_msa_md_median_family_income tract_to_msa_income_percentage tract_owner_occupied_units tract_one_to_four_family_homes tract_median_age_of_housing_units
49996 2019 5493005LE3FE2MIPPW66 31084 CA 6037 6.04e+09 C Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino Black or African American Male 1 6 2 1 31 1 2 2 2 85000 15.385 4.875 0.882 2 3169 NA 1760 NA NA 360 NA NA 2 2 2 2 525000 1 1 3 5 1 NA 30 36 7 10 2 NA NA NA NA 5 NA NA NA NA 1 4 3 NA NA NA NA 8 NA NA NA NA 1 4 1 5 1 4 55-64 9999 No NA 1 1 1 NA NA NA NA 10 NA NA NA 5569 79.3 73100 129 1100 1330 59
49997 2019 5493005LE3FE2MIPPW66 36084 CA 6013 6.01e+09 C Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino Asian Joint 1 6 2 1 32 1 2 2 2 615000 73.653 4.875 0.588 2 4054 NA 1760 NA 3834 360 NA NA 2 2 2 2 835000 1 1 3 5 1 NA 232 20%-<30% 7 7 2 NA NA NA NA 2 NA NA NA NA 2 2 2 21 NA NA NA 2 21 NA NA NA 2 2 1 2 2 2 45-54 35-44 No No 1 1 1 NA NA NA NA 10 NA NA NA 7492 80.7 111700 214 1682 1892 9
49998 2019 5493005LE3FE2MIPPW66 31084 CA 6037 6.04e+09 C Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino Black or African American Male 1 6 2 1 32 1 2 2 2 385000 79.835 4.875 0.854 2 12654 NA 10490 8730 NA 360 NA NA 2 2 2 2 485000 1 1 3 5 1 NA 83 38 7 10 2 NA NA NA NA 5 NA NA NA NA 1 4 3 NA NA NA NA 8 NA NA NA NA 1 4 1 5 1 4 55-64 9999 No NA 1 1 1 NA NA NA NA 10 NA NA NA 4676 97.0 73100 73 389 877 63
49999 2019 5493005LE3FE2MIPPW66 33700 CA 6099 6.10e+09 C Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino White Male 1 6 2 1 1 1 2 2 2 135000 80 5.49 1.4 2 3708.5 NA 1760 NA NA 360 NA NA 2 2 2 2 175000 1 1 3 5 1 NA 60 30%-<36% 7 10 2 NA NA NA NA 5 NA NA NA NA 2 4 5 NA NA NA NA 8 NA NA NA NA 2 4 1 5 2 4 25-34 9999 No NA 1 1 1 NA NA NA NA 10 NA NA NA 4824 47.5 64500 74 788 1396 49
50000 2019 5493005LE3FE2MIPPW66 11244 CA 6059 6.06e+09 C Conventional:First Lien Single Family (1-4 Units):Site-Built Not Hispanic or Latino White Male 1 71 2 1 32 1 2 2 2 155000 24.59 5.25 1.256 2 6108 NA 4039 2279 NA 360 NA NA 2 2 2 2 615000 1 1 3 5 1 NA 66 <20% 7 10 2 NA NA NA NA 5 NA NA NA NA 1 4 5 NA NA NA NA 8 NA NA NA NA 1 4 1 5 1 4 65-74 9999 Yes NA 1 1 1 NA NA NA NA 10 NA NA NA 5706 70.4 97900 65 734 1117 51

Subsetting to California Only; non-business properties; principal residences only

Removed “msa = derived_msa_md,” because of error

hmda_ca <- subset(hmda_ca,business_or_commercial_purpose=="2")
str(hmda_ca)
## 'data.frame':    48030 obs. of  99 variables:
##  $ activity_year                           : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ lei                                     : chr  "549300HN58ONH5KNJJ12" "254900WTZC5SSKIN2M11" "254900WTZC5SSKIN2M11" "254900WTZC5SSKIN2M11" ...
##  $ derived_msa.md                          : int  31084 99999 23420 40140 40140 31084 41940 99999 23420 36084 ...
##  $ state_code                              : chr  "CA" "CA" "CA" "CA" ...
##  $ county_code                             : int  6037 6033 6019 6065 6071 6037 6085 6105 6019 6013 ...
##  $ census_tract                            : num  6.04e+09 6.03e+09 6.02e+09 6.07e+09 6.07e+09 ...
##  $ conforming_loan_limit                   : chr  "NC" "C" "C" "C" ...
##  $ derived_loan_product_type               : chr  "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" ...
##  $ derived_dwelling_category               : chr  "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Manufactured" "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" ...
##  $ derived_ethnicity                       : chr  "Not Hispanic or Latino" "Ethnicity Not Available" "Hispanic or Latino" "Hispanic or Latino" ...
##  $ derived_race                            : chr  "White" "White" "Race Not Available" "White" ...
##  $ derived_sex                             : chr  "Male" "Male" "Joint" "Male" ...
##  $ action_taken                            : int  1 4 1 1 3 1 2 3 4 1 ...
##  $ purchaser_type                          : int  9 0 0 0 0 0 0 0 0 0 ...
##  $ preapproval                             : int  1 2 2 2 2 2 2 2 2 2 ...
##  $ loan_type                               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ loan_purpose                            : int  1 4 31 31 1 32 32 31 1 31 ...
##  $ lien_status                             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ reverse_mortgage                        : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ open.end_line_of_credit                 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ business_or_commercial_purpose          : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ loan_amount                             : int  1125000 165000 175000 315000 115000 165000 575000 115000 205000 265000 ...
##  $ loan_to_value_ratio                     : chr  "80" NA "74.09" "74.73" ...
##  $ interest_rate                           : chr  "5.25" NA "7" "6.625" ...
##  $ rate_spread                             : chr  NA NA "2.45" "2.054" ...
##  $ hoepa_status                            : int  2 3 2 2 3 2 3 3 3 2 ...
##  $ total_loan_costs                        : chr  "16765" NA "6121.9" "12961.2" ...
##  $ total_points_and_fees                   : chr  NA NA NA NA ...
##  $ origination_charges                     : chr  "11200" NA "4535" "10985" ...
##  $ discount_points                         : chr  NA NA NA NA ...
##  $ lender_credits                          : chr  NA NA "500" "650" ...
##  $ loan_term                               : chr  "360" "240" "360" "360" ...
##  $ prepayment_penalty_term                 : chr  NA NA NA NA ...
##  $ intro_rate_period                       : chr  NA NA NA NA ...
##  $ negative_amortization                   : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ interest_only_payment                   : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ balloon_payment                         : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ other_nonamortizing_features            : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ property_value                          : chr  "1405000" NA "235000" "415000" ...
##  $ construction_method                     : int  1 2 1 1 1 1 1 1 1 1 ...
##  $ occupancy_type                          : int  1 1 1 1 1 1 1 2 1 1 ...
##  $ manufactured_home_secured_property_type : int  3 1 3 3 3 3 3 3 3 3 ...
##  $ manufactured_home_land_property_interest: int  5 1 5 5 5 5 5 5 5 5 ...
##  $ total_units                             : chr  "1" "1" "1" "1" ...
##  $ multifamily_affordable_units            : chr  NA NA NA NA ...
##  $ income                                  : int  432 44 72 146 42 64 129 66 59 48 ...
##  $ debt_to_income_ratio                    : chr  "20%-<30%" NA "20%-<30%" "20%-<30%" ...
##  $ applicant_credit_score_type             : int  2 9 3 3 3 3 3 3 9 3 ...
##  $ co.applicant_credit_score_type          : int  10 9 9 10 9 9 9 10 9 10 ...
##  $ applicant_ethnicity.1                   : int  2 3 1 1 1 1 1 2 1 1 ...
##  $ applicant_ethnicity.2                   : int  NA NA 11 NA 11 11 11 NA NA NA ...
##  $ applicant_ethnicity.3                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_ethnicity.4                   : logi  NA NA NA NA NA NA ...
##  $ applicant_ethnicity.5                   : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.1                : int  5 5 1 5 1 1 1 5 5 5 ...
##  $ co.applicant_ethnicity.2                : int  NA NA 11 NA 11 11 11 NA NA NA ...
##  $ co.applicant_ethnicity.3                : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.4                : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.5                : logi  NA NA NA NA NA NA ...
##  $ applicant_ethnicity_observed            : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ co.applicant_ethnicity_observed         : int  4 4 2 4 2 2 2 4 4 4 ...
##  $ applicant_race.1                        : int  5 5 6 5 5 5 5 2 5 5 ...
##  $ applicant_race.2                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.3                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.4                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.5                        : logi  NA NA NA NA NA NA ...
##  $ co.applicant_race.1                     : int  8 8 6 8 5 5 5 8 8 8 ...
##  $ co.applicant_race.2                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ co.applicant_race.3                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ co.applicant_race.4                     : logi  NA NA NA NA NA NA ...
##  $ co.applicant_race.5                     : logi  NA NA NA NA NA NA ...
##  $ applicant_race_observed                 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ co.applicant_race_observed              : int  4 4 2 4 2 2 2 4 4 4 ...
##  $ applicant_sex                           : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ co.applicant_sex                        : int  5 5 2 5 2 2 1 5 5 5 ...
##  $ applicant_sex_observed                  : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ co.applicant_sex_observed               : int  4 4 2 4 2 2 2 4 4 4 ...
##  $ applicant_age                           : chr  "35-44" "35-44" "35-44" "35-44" ...
##  $ co.applicant_age                        : chr  "9999" "9999" "35-44" "9999" ...
##  $ applicant_age_above_62                  : chr  "No" "No" "No" "No" ...
##  $ co.applicant_age_above_62               : chr  NA NA "No" NA ...
##  $ submission_of_application               : int  1 1 1 2 2 2 2 1 2 2 ...
##  $ initially_payable_to_institution        : int  2 1 1 1 1 1 1 1 1 1 ...
##  $ aus.1                                   : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ aus.2                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.3                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.4                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.5                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.1                         : int  10 10 10 10 4 10 10 4 10 10 ...
##  $ denial_reason.2                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.3                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.4                         : logi  NA NA NA NA NA NA ...
##  $ tract_population                        : int  4162 3516 5493 5344 6857 7118 6848 2628 5321 6235 ...
##  $ tract_minority_population_percent       : num  16.2 21.7 83.8 47.4 66.7 ...
##  $ ffiec_msa_md_median_family_income       : int  73100 64800 57300 69700 69700 73100 129900 64800 57300 111700 ...
##  $ tract_to_msa_income_percentage          : int  249 100 74 130 61 78 108 89 151 43 ...
##  $ tract_owner_occupied_units              : int  1369 1147 540 1324 1129 1102 1844 876 1355 518 ...
##  $ tract_one_to_four_family_homes          : int  1635 1663 1255 1826 2463 1540 2125 1908 1612 1575 ...
##  $ tract_median_age_of_housing_units       : int  56 32 48 10 54 54 49 39 13 17 ...
#rename(hmda_ca, ethnicity=derived_ethnicity, race=derived_race,
       #sex=derived_sex)
hmda_ca <- subset(hmda_ca,business_or_commercial_purpose=="2")
str(hmda_ca)
## 'data.frame':    48030 obs. of  99 variables:
##  $ activity_year                           : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ lei                                     : chr  "549300HN58ONH5KNJJ12" "254900WTZC5SSKIN2M11" "254900WTZC5SSKIN2M11" "254900WTZC5SSKIN2M11" ...
##  $ derived_msa.md                          : int  31084 99999 23420 40140 40140 31084 41940 99999 23420 36084 ...
##  $ state_code                              : chr  "CA" "CA" "CA" "CA" ...
##  $ county_code                             : int  6037 6033 6019 6065 6071 6037 6085 6105 6019 6013 ...
##  $ census_tract                            : num  6.04e+09 6.03e+09 6.02e+09 6.07e+09 6.07e+09 ...
##  $ conforming_loan_limit                   : chr  "NC" "C" "C" "C" ...
##  $ derived_loan_product_type               : chr  "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" "Conventional:First Lien" ...
##  $ derived_dwelling_category               : chr  "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Manufactured" "Single Family (1-4 Units):Site-Built" "Single Family (1-4 Units):Site-Built" ...
##  $ derived_ethnicity                       : chr  "Not Hispanic or Latino" "Ethnicity Not Available" "Hispanic or Latino" "Hispanic or Latino" ...
##  $ derived_race                            : chr  "White" "White" "Race Not Available" "White" ...
##  $ derived_sex                             : chr  "Male" "Male" "Joint" "Male" ...
##  $ action_taken                            : int  1 4 1 1 3 1 2 3 4 1 ...
##  $ purchaser_type                          : int  9 0 0 0 0 0 0 0 0 0 ...
##  $ preapproval                             : int  1 2 2 2 2 2 2 2 2 2 ...
##  $ loan_type                               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ loan_purpose                            : int  1 4 31 31 1 32 32 31 1 31 ...
##  $ lien_status                             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ reverse_mortgage                        : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ open.end_line_of_credit                 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ business_or_commercial_purpose          : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ loan_amount                             : int  1125000 165000 175000 315000 115000 165000 575000 115000 205000 265000 ...
##  $ loan_to_value_ratio                     : chr  "80" NA "74.09" "74.73" ...
##  $ interest_rate                           : chr  "5.25" NA "7" "6.625" ...
##  $ rate_spread                             : chr  NA NA "2.45" "2.054" ...
##  $ hoepa_status                            : int  2 3 2 2 3 2 3 3 3 2 ...
##  $ total_loan_costs                        : chr  "16765" NA "6121.9" "12961.2" ...
##  $ total_points_and_fees                   : chr  NA NA NA NA ...
##  $ origination_charges                     : chr  "11200" NA "4535" "10985" ...
##  $ discount_points                         : chr  NA NA NA NA ...
##  $ lender_credits                          : chr  NA NA "500" "650" ...
##  $ loan_term                               : chr  "360" "240" "360" "360" ...
##  $ prepayment_penalty_term                 : chr  NA NA NA NA ...
##  $ intro_rate_period                       : chr  NA NA NA NA ...
##  $ negative_amortization                   : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ interest_only_payment                   : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ balloon_payment                         : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ other_nonamortizing_features            : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ property_value                          : chr  "1405000" NA "235000" "415000" ...
##  $ construction_method                     : int  1 2 1 1 1 1 1 1 1 1 ...
##  $ occupancy_type                          : int  1 1 1 1 1 1 1 2 1 1 ...
##  $ manufactured_home_secured_property_type : int  3 1 3 3 3 3 3 3 3 3 ...
##  $ manufactured_home_land_property_interest: int  5 1 5 5 5 5 5 5 5 5 ...
##  $ total_units                             : chr  "1" "1" "1" "1" ...
##  $ multifamily_affordable_units            : chr  NA NA NA NA ...
##  $ income                                  : int  432 44 72 146 42 64 129 66 59 48 ...
##  $ debt_to_income_ratio                    : chr  "20%-<30%" NA "20%-<30%" "20%-<30%" ...
##  $ applicant_credit_score_type             : int  2 9 3 3 3 3 3 3 9 3 ...
##  $ co.applicant_credit_score_type          : int  10 9 9 10 9 9 9 10 9 10 ...
##  $ applicant_ethnicity.1                   : int  2 3 1 1 1 1 1 2 1 1 ...
##  $ applicant_ethnicity.2                   : int  NA NA 11 NA 11 11 11 NA NA NA ...
##  $ applicant_ethnicity.3                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_ethnicity.4                   : logi  NA NA NA NA NA NA ...
##  $ applicant_ethnicity.5                   : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.1                : int  5 5 1 5 1 1 1 5 5 5 ...
##  $ co.applicant_ethnicity.2                : int  NA NA 11 NA 11 11 11 NA NA NA ...
##  $ co.applicant_ethnicity.3                : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.4                : logi  NA NA NA NA NA NA ...
##  $ co.applicant_ethnicity.5                : logi  NA NA NA NA NA NA ...
##  $ applicant_ethnicity_observed            : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ co.applicant_ethnicity_observed         : int  4 4 2 4 2 2 2 4 4 4 ...
##  $ applicant_race.1                        : int  5 5 6 5 5 5 5 2 5 5 ...
##  $ applicant_race.2                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.3                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.4                        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ applicant_race.5                        : logi  NA NA NA NA NA NA ...
##  $ co.applicant_race.1                     : int  8 8 6 8 5 5 5 8 8 8 ...
##  $ co.applicant_race.2                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ co.applicant_race.3                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ co.applicant_race.4                     : logi  NA NA NA NA NA NA ...
##  $ co.applicant_race.5                     : logi  NA NA NA NA NA NA ...
##  $ applicant_race_observed                 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ co.applicant_race_observed              : int  4 4 2 4 2 2 2 4 4 4 ...
##  $ applicant_sex                           : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ co.applicant_sex                        : int  5 5 2 5 2 2 1 5 5 5 ...
##  $ applicant_sex_observed                  : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ co.applicant_sex_observed               : int  4 4 2 4 2 2 2 4 4 4 ...
##  $ applicant_age                           : chr  "35-44" "35-44" "35-44" "35-44" ...
##  $ co.applicant_age                        : chr  "9999" "9999" "35-44" "9999" ...
##  $ applicant_age_above_62                  : chr  "No" "No" "No" "No" ...
##  $ co.applicant_age_above_62               : chr  NA NA "No" NA ...
##  $ submission_of_application               : int  1 1 1 2 2 2 2 1 2 2 ...
##  $ initially_payable_to_institution        : int  2 1 1 1 1 1 1 1 1 1 ...
##  $ aus.1                                   : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ aus.2                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.3                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.4                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ aus.5                                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.1                         : int  10 10 10 10 4 10 10 4 10 10 ...
##  $ denial_reason.2                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.3                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason.4                         : logi  NA NA NA NA NA NA ...
##  $ tract_population                        : int  4162 3516 5493 5344 6857 7118 6848 2628 5321 6235 ...
##  $ tract_minority_population_percent       : num  16.2 21.7 83.8 47.4 66.7 ...
##  $ ffiec_msa_md_median_family_income       : int  73100 64800 57300 69700 69700 73100 129900 64800 57300 111700 ...
##  $ tract_to_msa_income_percentage          : int  249 100 74 130 61 78 108 89 151 43 ...
##  $ tract_owner_occupied_units              : int  1369 1147 540 1324 1129 1102 1844 876 1355 518 ...
##  $ tract_one_to_four_family_homes          : int  1635 1663 1255 1826 2463 1540 2125 1908 1612 1575 ...
##  $ tract_median_age_of_housing_units       : int  56 32 48 10 54 54 49 39 13 17 ...

Subsetting principal residences only; tail and head check to make sure that the geography is widespread/our sample is “random”; there are now 59 distinct counties.

hmda_ca <- subset(hmda_ca,occupancy_type=="1")
dim(hmda_ca) #45735    99
## [1] 45735    99
#xkabledplyhead(hmda_ca,5)
#xkabledplytail(hmda_ca,5)
loadPkg("sqldf")
names(hmda_ca)
##  [1] "activity_year"                           
##  [2] "lei"                                     
##  [3] "derived_msa.md"                          
##  [4] "state_code"                              
##  [5] "county_code"                             
##  [6] "census_tract"                            
##  [7] "conforming_loan_limit"                   
##  [8] "derived_loan_product_type"               
##  [9] "derived_dwelling_category"               
## [10] "derived_ethnicity"                       
## [11] "derived_race"                            
## [12] "derived_sex"                             
## [13] "action_taken"                            
## [14] "purchaser_type"                          
## [15] "preapproval"                             
## [16] "loan_type"                               
## [17] "loan_purpose"                            
## [18] "lien_status"                             
## [19] "reverse_mortgage"                        
## [20] "open.end_line_of_credit"                 
## [21] "business_or_commercial_purpose"          
## [22] "loan_amount"                             
## [23] "loan_to_value_ratio"                     
## [24] "interest_rate"                           
## [25] "rate_spread"                             
## [26] "hoepa_status"                            
## [27] "total_loan_costs"                        
## [28] "total_points_and_fees"                   
## [29] "origination_charges"                     
## [30] "discount_points"                         
## [31] "lender_credits"                          
## [32] "loan_term"                               
## [33] "prepayment_penalty_term"                 
## [34] "intro_rate_period"                       
## [35] "negative_amortization"                   
## [36] "interest_only_payment"                   
## [37] "balloon_payment"                         
## [38] "other_nonamortizing_features"            
## [39] "property_value"                          
## [40] "construction_method"                     
## [41] "occupancy_type"                          
## [42] "manufactured_home_secured_property_type" 
## [43] "manufactured_home_land_property_interest"
## [44] "total_units"                             
## [45] "multifamily_affordable_units"            
## [46] "income"                                  
## [47] "debt_to_income_ratio"                    
## [48] "applicant_credit_score_type"             
## [49] "co.applicant_credit_score_type"          
## [50] "applicant_ethnicity.1"                   
## [51] "applicant_ethnicity.2"                   
## [52] "applicant_ethnicity.3"                   
## [53] "applicant_ethnicity.4"                   
## [54] "applicant_ethnicity.5"                   
## [55] "co.applicant_ethnicity.1"                
## [56] "co.applicant_ethnicity.2"                
## [57] "co.applicant_ethnicity.3"                
## [58] "co.applicant_ethnicity.4"                
## [59] "co.applicant_ethnicity.5"                
## [60] "applicant_ethnicity_observed"            
## [61] "co.applicant_ethnicity_observed"         
## [62] "applicant_race.1"                        
## [63] "applicant_race.2"                        
## [64] "applicant_race.3"                        
## [65] "applicant_race.4"                        
## [66] "applicant_race.5"                        
## [67] "co.applicant_race.1"                     
## [68] "co.applicant_race.2"                     
## [69] "co.applicant_race.3"                     
## [70] "co.applicant_race.4"                     
## [71] "co.applicant_race.5"                     
## [72] "applicant_race_observed"                 
## [73] "co.applicant_race_observed"              
## [74] "applicant_sex"                           
## [75] "co.applicant_sex"                        
## [76] "applicant_sex_observed"                  
## [77] "co.applicant_sex_observed"               
## [78] "applicant_age"                           
## [79] "co.applicant_age"                        
## [80] "applicant_age_above_62"                  
## [81] "co.applicant_age_above_62"               
## [82] "submission_of_application"               
## [83] "initially_payable_to_institution"        
## [84] "aus.1"                                   
## [85] "aus.2"                                   
## [86] "aus.3"                                   
## [87] "aus.4"                                   
## [88] "aus.5"                                   
## [89] "denial_reason.1"                         
## [90] "denial_reason.2"                         
## [91] "denial_reason.3"                         
## [92] "denial_reason.4"                         
## [93] "tract_population"                        
## [94] "tract_minority_population_percent"       
## [95] "ffiec_msa_md_median_family_income"       
## [96] "tract_to_msa_income_percentage"          
## [97] "tract_owner_occupied_units"              
## [98] "tract_one_to_four_family_homes"          
## [99] "tract_median_age_of_housing_units"
sqldf("select count(distinct(county_code)) from hmda_ca")
##   count(distinct(county_code))
## 1                           59
unloadPkg("sqldf")

Subsetting to only relevant actions: denial or approval

hmda_ca1<-hmda_ca%>%filter(action_taken %in% c("1", "3"))
hmda_ca<-hmda_ca1
dim(hmda_ca) #30661    99
## [1] 30661    99

Subsetting relevant variables for answering the SMART question

hmda_ca_final <- hmda_ca[c(10,11,12,13,22,24,39,46,50,62,74,78)]
str(hmda_ca_final)
## 'data.frame':    30661 obs. of  12 variables:
##  $ derived_ethnicity    : chr  "Not Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino" ...
##  $ derived_race         : chr  "White" "Race Not Available" "White" "White" ...
##  $ derived_sex          : chr  "Male" "Joint" "Male" "Joint" ...
##  $ action_taken         : int  1 1 1 3 1 1 1 1 3 1 ...
##  $ loan_amount          : int  1125000 175000 315000 115000 165000 265000 445000 335000 315000 285000 ...
##  $ interest_rate        : chr  "5.25" "7" "6.625" NA ...
##  $ property_value       : chr  "1405000" "235000" "415000" "145000" ...
##  $ income               : int  432 72 146 42 64 48 101 125 71 78 ...
##  $ applicant_ethnicity.1: int  2 1 1 1 1 1 3 1 1 1 ...
##  $ applicant_race.1     : int  5 6 5 5 5 5 6 5 5 5 ...
##  $ applicant_sex        : int  1 1 1 1 2 1 2 1 1 1 ...
##  $ applicant_age        : chr  "35-44" "35-44" "35-44" "25-34" ...

Changing vector types

hmda_ca_final_1 = hmda_ca_final
hmda_ca_final_1$derived_ethnicity = factor(hmda_ca_final$derived_ethnicity)
hmda_ca_final_1$derived_race = factor(hmda_ca_final$derived_race)
hmda_ca_final_1$derived_sex = factor(hmda_ca_final$derived_sex)
#hmda_ca_final_1$action_taken = factor(hmda_ca_final$action_taken)
hmda_ca_final_1$loan_amount = as.numeric(hmda_ca_final$loan_amount)
hmda_ca_final_1$interest_rate = as.numeric(hmda_ca_final$interest_rate)
hmda_ca_final_1$property_value = as.numeric(hmda_ca_final$property_value)
hmda_ca_final_1$income = as.numeric(hmda_ca_final$income)
hmda_ca_final_1$applicant_age = factor(hmda_ca_final$applicant_age)
str(hmda_ca_final_1)
## 'data.frame':    30661 obs. of  12 variables:
##  $ derived_ethnicity    : Factor w/ 4 levels "Ethnicity Not Available",..: 4 2 2 2 2 2 1 2 2 2 ...
##  $ derived_race         : Factor w/ 8 levels "2 or more minority races",..: 8 7 8 8 8 8 7 8 8 8 ...
##  $ derived_sex          : Factor w/ 4 levels "Female","Joint",..: 3 2 3 2 1 3 1 3 3 3 ...
##  $ action_taken         : int  1 1 1 3 1 1 1 1 3 1 ...
##  $ loan_amount          : num  1125000 175000 315000 115000 165000 ...
##  $ interest_rate        : num  5.25 7 6.62 NA 6.75 ...
##  $ property_value       : num  1405000 235000 415000 145000 435000 ...
##  $ income               : num  432 72 146 42 64 48 101 125 71 78 ...
##  $ applicant_ethnicity.1: int  2 1 1 1 1 1 3 1 1 1 ...
##  $ applicant_race.1     : int  5 6 5 5 5 5 6 5 5 5 ...
##  $ applicant_sex        : int  1 1 1 1 2 1 2 1 1 1 ...
##  $ applicant_age        : Factor w/ 8 levels "<25",">74","25-34",..: 4 4 4 3 4 4 4 5 5 4 ...

#Examine and filter out the missing values

missvalue <- is.na(hmda_ca_final_1)
summary(missvalue)
##  derived_ethnicity derived_race    derived_sex     action_taken   
##  Mode :logical     Mode :logical   Mode :logical   Mode :logical  
##  FALSE:30661       FALSE:30661     FALSE:30661     FALSE:30661    
##                                                                   
##  loan_amount     interest_rate   property_value    income       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:30661     FALSE:23742     FALSE:28927     FALSE:29359    
##                  TRUE :6919      TRUE :1734      TRUE :1302     
##  applicant_ethnicity.1 applicant_race.1 applicant_sex   applicant_age  
##  Mode :logical         Mode :logical    Mode :logical   Mode :logical  
##  FALSE:30661           FALSE:30661      FALSE:30661     FALSE:30661    
## 
#There are 30,661 observations after we've filtered on all of the relevant fields. 
#Missing values are present in interest_rate: 6,919; property_value: 1,734, and income: 1,302.
hmda_ca_final_2 <- hmda_ca_final_1$interest_rate[is.na(hmda_ca_final_1$interest_rate)] <- mean(hmda_ca_final_1$interest_rate, na.rm = TRUE)
hmda_ca_final_2 <- hmda_ca_final_1 %>% drop_na(applicant_ethnicity.1)
hmda_ca_final_2 <- na.omit(hmda_ca_final_1, cols="income")
missvalue1 <- is.na(hmda_ca_final_2)
summary(missvalue1)
##  derived_ethnicity derived_race    derived_sex     action_taken   
##  Mode :logical     Mode :logical   Mode :logical   Mode :logical  
##  FALSE:28695       FALSE:28695     FALSE:28695     FALSE:28695    
##  loan_amount     interest_rate   property_value    income       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:28695     FALSE:28695     FALSE:28695     FALSE:28695    
##  applicant_ethnicity.1 applicant_race.1 applicant_sex   applicant_age  
##  Mode :logical         Mode :logical    Mode :logical   Mode :logical  
##  FALSE:28695           FALSE:28695      FALSE:28695     FALSE:28695
#After cleaning out the missing values, we are left with 28,695 observations

VI. Exploratory Data Analysis

#Exploring the categorical values first

library(ggplot2)
ggplot(hmda_ca_final_2, aes(x = factor(derived_sex))) +
    geom_bar(color="black", fill="antiquewhite2")+
  labs(title="Graph 1. Applicant sex distribution", x="Applicant Sex", y="Count")

ggplot(hmda_ca_final_2, aes(x = factor(applicant_age))) +
    geom_bar(color="black", fill="bisque3")+
  labs(title="Graph 2. Applicant age distribution", x="Applicant Age", y="Count")

ggplot(hmda_ca_final_2, aes(x = factor(derived_ethnicity))) +
    geom_bar(color="black", fill="cornsilk3")+
  labs(title="Graph 3. Applicant ethnicity distribution", x="Applicant Ethnicity", y="Count")

ggplot(hmda_ca_final_2, aes(x = factor(action_taken))) +
    geom_bar(color="black", fill="azure3")+
  labs(title="Graph 4. Action taken distribution", x="Action Taken", y="Count")

ggplot(hmda_ca_final_2, aes(x = factor(derived_race))) +
    geom_bar(color="black", fill="azure3")+
  labs(title="Graph 5. Derived Race", x="Race of the Applicant", y= "Count")+theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Next, we proceed to exploring the numerical data for normality and outliers. Graphs 8-10 indicate that none of the numerical variables are normally distributed.

ggplot(hmda_ca_final_2,aes(x=loan_amount))+
  geom_histogram(color="black", fill="steelblue")+
  labs(title=" Graph 6. Histogram of Loan Amount", x="Loan Amount in dollars", y="Frequency")+theme_bw()

ggplot(hmda_ca_final_2,aes(x=interest_rate))+
  geom_histogram(color="black", fill="pink")+
  labs(title="Graph 8. Histogram of Interest Rate", x="Interest Rates", y="Frequency")+theme_bw()

ggplot(hmda_ca_final_2,aes(x=income))+
  geom_histogram(color="black", fill="azure3")+
  labs(title="Graph 9. Histogram of Income", x="Income in dollars", y="Frequency")+theme_bw()

ggplot(hmda_ca_final_2,aes(x=property_value))+
  geom_histogram(color="black", fill="lightblue")+
  labs(title="Graph 10. Histogram of Property Values", x="Property Values in dollars", y="Frequency")+theme_bw()

As another check, we inspect qqnorm plots and confirm that the distributions are not normal(Graph 11-14)

qqnorm(hmda_ca_final_2$interest_rate,
       main="Graph 11. QQ Plot of Interest Rates",
       ylab="Interest Rate",
       col="pink")
qqline(hmda_ca_final_2$interest_rate)

qqnorm(hmda_ca_final_2$income,
       main="Graph 13. QQ Plot of Income",
       ylab="Income",
       col="green")
qqline(hmda_ca_final_2$income)

qqnorm(hmda_ca_final_2$property_value,
       main="Graph 12. QQ Plot of Property Values",
       ylab="Property Value",
       col="blue")
qqline(hmda_ca_final_2$property_value)

qqnorm(hmda_ca_final_2$loan_amount,
       main="Graph 14.QQ Plot of Loan Amount",
       ylab="Loan Amount",
       col="purple")
qqline(hmda_ca_final_1$loan_amount)

Next, we remove outliers and check for normality again.

hmda_ca_final_no_outliers_3 <- outlierKD2(hmda_ca_final_2,interest_rate)

## Outliers identified: 2664 
## Propotion (%) of outliers: 10.2 
## Mean of the outliers: 7.69 
## Mean without removing outliers: 4.49 
## Mean if we remove outliers: 4.16 
## Outliers successfully removed
hmda_ca_final_no_outliers_4 <- outlierKD2(hmda_ca_final_no_outliers_3,property_value)

## Outliers identified: 1821 
## Propotion (%) of outliers: 6.8 
## Mean of the outliers: 2132309 
## Mean without removing outliers: 644638 
## Mean if we remove outliers: 543833 
## Outliers successfully removed
hmda_ca_final_no_outliers_5 <- outlierKD2(hmda_ca_final_no_outliers_4,income)

## Outliers identified: 1110 
## Propotion (%) of outliers: 4 
## Mean of the outliers: 410 
## Mean without removing outliers: 118 
## Mean if we remove outliers: 107 
## Outliers successfully removed
loans <- outlierKD2(hmda_ca_final_no_outliers_5,loan_amount)

## Outliers identified: 451 
## Propotion (%) of outliers: 1.6 
## Mean of the outliers: 1397129 
## Mean without removing outliers: 332997 
## Mean if we remove outliers: 316005 
## Outliers successfully removed

We then reexamine the variables for normality after removing the outliers.

qqnorm(loans$interest_rate,
       main="Graph 15. QQ Plot of Interest Rates",
       ylab="Interest Rate",
       col="pink")
qqline(loans$interest_rate)

qqnorm(loans$property_value,
       main="Graph 16. QQ Plot of Property Values",
       ylab="Property Value",
       col="blue")
qqline(loans$property_value)

qqnorm(loans$income,
       main="Graph 17.QQ Plot of Income",
       ylab="Income",
       col="green")
qqline(loans$income)

qqnorm(loans$loan_amount,
       main="Graph 18. QQ Plot of Loan Amount",
       ylab="Loan Amount",
       col="purple")
qqline(loans$loan_amount)

The values do not appear to be normally distributed even after removing the outliers

Numerical_var <- subset(loans,select=c(loan_amount, income, property_value, interest_rate))
#str(Numerical_var)
library(kableExtra)
summary_t<-kbl(summary(Numerical_var))%>%
  kable_styling()
summary_t
loan_amount income property_value interest_rate
Min. : 5000 Min. :-19 Min. : 15000 Min. :2
1st Qu.:125000 1st Qu.: 64 1st Qu.: 385000 1st Qu.:4
Median :305000 Median : 99 Median : 515000 Median :4
Mean :316005 Mean :107 Mean : 543833 Mean :4
3rd Qu.:455000 3rd Qu.:143 3rd Qu.: 665000 3rd Qu.:4
Max. :975000 Max. :276 Max. :1175000 Max. :6
NA’s :451 NA’s :1110 NA’s :1821 NA’s :2664
# GROUP: what's with the NAs in the summary table after we have cleaned the NAs out?

However, the means and the medians are fairly close in all instances; do we keep removing the outliers?

#this chunk is for visual inspection only: EITHER DELETE or FORMAT before the final submission
boxplot(loans$interest_rate)

boxplot(loans$loan_amount)

boxplot(loans$property_value)

boxplot(loans$income)

Starting the tests

library(lattice)
#dim(loans) #28695    12
pairs(loans)

#str(loans)

library(epiDisplay)
tab1(loans$derived_race, sort.group = "decreasing", cum.percent = TRUE)

## loans$derived_race : 
##                                           Frequency Percent Cum. percent
## White                                         17022    59.3         59.3
## Race Not Available                             4295    15.0         74.3
## Asian                                          3496    12.2         86.5
## Black or African American                      1905     6.6         93.1
## Joint                                          1349     4.7         97.8
## American Indian or Alaska Native                258     0.9         98.7
## Native Hawaiian or Other Pacific Islander       243     0.8         99.6
## 2 or more minority races                        127     0.4        100.0
##   Total                                       28695   100.0        100.0

##Preparing data for corrplot

cor_loans<-cor(loans_all_num, use="complete.obs")
xkabledply(cor_loans)
Table
action_taken loan_amount interest_rate property_value income applicant_ethnicity.1 applicant_race.1 applicant_sex applicant_age Black AIAN NHPI Asian Joint
action_taken 1.0000 -0.2118 0.2412 -0.0772 -0.1038 0.0134 0.0085 0.0272 0.0478 0.1045 0.0326 0.0389 0.0323 0.0028
loan_amount -0.2118 1.0000 -0.2426 0.4430 0.4005 0.0252 0.0207 -0.0666 -0.1318 0.0034 -0.0221 -0.0066 0.0861 0.0529
interest_rate 0.2412 -0.2426 1.0000 -0.0796 -0.1274 -0.0007 -0.0143 0.0433 0.0161 0.0430 -0.0019 0.0208 -0.0296 -0.0263
property_value -0.0772 0.4430 -0.0796 1.0000 0.4333 0.0192 0.0091 -0.0035 0.0954 -0.0409 -0.0392 -0.0082 0.0995 0.0297
income -0.1038 0.4005 -0.1274 0.4333 1.0000 0.0284 0.0353 -0.0794 -0.0122 -0.0347 -0.0274 -0.0116 0.0749 0.1145
applicant_ethnicity.1 0.0134 0.0252 -0.0007 0.0192 0.0284 1.0000 0.0463 0.0686 0.0064 -0.0128 0.0016 0.0001 0.0150 -0.0150
applicant_race.1 0.0085 0.0207 -0.0143 0.0091 0.0353 0.0463 1.0000 0.0218 -0.0097 -0.1579 -0.1099 0.2881 0.0518 -0.0153
applicant_sex 0.0272 -0.0666 0.0433 -0.0035 -0.0794 0.0686 0.0218 1.0000 0.0292 0.0056 -0.0215 -0.0009 -0.0403 -0.0629
applicant_age 0.0478 -0.1318 0.0161 0.0954 -0.0122 0.0064 -0.0097 0.0292 1.0000 0.0387 -0.0089 0.0105 -0.0177 -0.0092
Black 0.1045 0.0034 0.0430 -0.0409 -0.0347 -0.0128 -0.1579 0.0056 0.0387 1.0000 -0.0264 -0.0246 -0.1005 -0.0598
AIAN 0.0326 -0.0221 -0.0019 -0.0392 -0.0274 0.0016 -0.1099 -0.0215 -0.0089 -0.0264 1.0000 -0.0089 -0.0361 -0.0215
NHPI 0.0389 -0.0066 0.0208 -0.0082 -0.0116 0.0001 0.2881 -0.0009 0.0105 -0.0246 -0.0089 1.0000 -0.0337 -0.0201
Asian 0.0323 0.0861 -0.0296 0.0995 0.0749 0.0150 0.0518 -0.0403 -0.0177 -0.1005 -0.0361 -0.0337 1.0000 -0.0819
Joint 0.0028 0.0529 -0.0263 0.0297 0.1145 -0.0150 -0.0153 -0.0629 -0.0092 -0.0598 -0.0215 -0.0201 -0.0819 1.0000
loadPkg("corrplot")
corrplot(cor_loans)

Chi-Square Test for Loan Approval and Race: result=>reject the null of independence: all tests REJECT THE NULL HYPOTHESIS OF INDEPENDENCE NOW THAT WE HAVE MORE DATA (Applicant age is the only issue)

contable1 = table(loans$derived_race, loans$action_taken)
xkabledply(contable1, title="Contingency table for Loan Approval and Race")
Contingency table for Loan Approval and Race
1 3
2 or more minority races 75 52
American Indian or Alaska Native 170 88
Asian 2672 824
Black or African American 1242 663
Joint 1062 287
Native Hawaiian or Other Pacific Islander 158 85
Race Not Available 3287 1008
White 14107 2915
chitest1 = chisq.test(contable1)
chitest1
## 
##  Pearson's Chi-squared test
## 
## data:  contable1
## X-squared = 492, df = 7, p-value <2e-16
contable2 = table(loans$derived_sex, loans$action_taken)
xkabledply(contable2, title="Contingency table for Loan Approval and Sex")
Contingency table for Loan Approval and Sex
1 3
Female 4639 1346
Joint 10354 2065
Male 6771 2161
Sex Not Available 1009 350
chitest2 = chisq.test(contable2)
chitest2
## 
##  Pearson's Chi-squared test
## 
## data:  contable2
## X-squared = 225, df = 3, p-value <2e-16
contable3 = table(loans$derived_ethnicity, loans$action_taken)
xkabledply(contable3, title="Contingency table for Loan Approval and Ethnicity")
Contingency table for Loan Approval and Ethnicity
1 3
Ethnicity Not Available 2891 830
Hispanic or Latino 4145 1291
Joint 1229 285
Not Hispanic or Latino 14508 3516
chitest3 = chisq.test(contable3)
chitest3
## 
##  Pearson's Chi-squared test
## 
## data:  contable3
## X-squared = 56, df = 3, p-value = 5e-12
contable4 = table(loans$applicant_age, loans$action_taken)
xkabledply(contable4, title="Contingency table for Loan Approval and Age")
Contingency table for Loan Approval and Age
1 3
<25 253 58
>74 1621 373
25-34 3897 823
35-44 5606 1417
45-54 4924 1382
55-64 3674 1089
65-74 2798 779
8888 0 1
chitest4 = chisq.test(contable4)
chitest4
## 
##  Pearson's Chi-squared test
## 
## data:  contable4
## X-squared = 63, df = 7, p-value = 4e-11

#adding New Code for final and downsampling

hmda <- data.frame(loans)
str(hmda$action_taken)
##  int [1:28695] 1 1 1 3 1 1 1 1 3 1 ...
hmda$approval<- ifelse(hmda$action_taken=="1", "Approved","Denied")
hmda$approval<- factor(hmda$approval)
str(hmda)
## 'data.frame':    28695 obs. of  20 variables:
##  $ derived_ethnicity    : Factor w/ 4 levels "Ethnicity Not Available",..: 4 2 2 2 2 2 1 2 2 2 ...
##  $ derived_race         : Factor w/ 8 levels "2 or more minority races",..: 8 7 8 8 8 8 7 8 8 8 ...
##  $ derived_sex          : Factor w/ 4 levels "Female","Joint",..: 3 2 3 2 1 3 1 3 3 3 ...
##  $ action_taken         : int  1 1 1 3 1 1 1 1 3 1 ...
##  $ loan_amount          : num  NA 175000 315000 115000 165000 265000 445000 335000 315000 285000 ...
##  $ interest_rate        : num  5.25 NA NA 4.46 NA ...
##  $ property_value       : num  NA 235000 415000 145000 435000 415000 705000 425000 865000 405000 ...
##  $ income               : num  NA 72 146 42 64 48 101 125 71 78 ...
##  $ applicant_ethnicity.1: int  2 1 1 1 1 1 3 1 1 1 ...
##  $ applicant_race.1     : int  5 6 5 5 5 5 6 5 5 5 ...
##  $ applicant_sex        : int  1 1 1 1 2 1 2 1 1 1 ...
##  $ applicant_age        : Factor w/ 8 levels "<25",">74","25-34",..: 4 4 4 3 4 4 4 5 5 4 ...
##  $ Black                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AIAN                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NHPI                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Asian                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Joint                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ N_A                  : num  0 1 0 0 0 0 1 0 0 0 ...
##  $ two_or               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ approval             : Factor w/ 2 levels "Approved","Denied": 1 1 1 2 1 1 1 1 2 1 ...
table(hmda$approval)
## 
## Approved   Denied 
##    22773     5922
#Approved   Denied 
   #22773     5922 

Checking frequency of approval variable and derived_race variable

with(hmda,
     {
       print(table(derived_race))
       print(table(approval))
     }
)
## derived_race
##                  2 or more minority races 
##                                       127 
##          American Indian or Alaska Native 
##                                       258 
##                                     Asian 
##                                      3496 
##                 Black or African American 
##                                      1905 
##                                     Joint 
##                                      1349 
## Native Hawaiian or Other Pacific Islander 
##                                       243 
##                        Race Not Available 
##                                      4295 
##                                     White 
##                                     17022 
## approval
## Approved   Denied 
##    22773     5922

Creating DownSample of hmda dataset, the Approved far outnumber the Denied making this dataset severely unbalanced #Approved: 22773 Denied: 5922

Approved<- which(hmda$action_taken=="1")
Denied<- which(hmda$action_taken=="3")

length(Approved)
## [1] 22773
length(Denied)
## [1] 5922
approved_downsample<-sample(Approved,length(Denied))
hmda_down<- hmda[c(approved_downsample, Denied),]
str(hmda_down)
## 'data.frame':    11844 obs. of  20 variables:
##  $ derived_ethnicity    : Factor w/ 4 levels "Ethnicity Not Available",..: 1 4 4 4 4 3 2 4 4 4 ...
##  $ derived_race         : Factor w/ 8 levels "2 or more minority races",..: 7 4 8 8 4 8 8 8 8 8 ...
##  $ derived_sex          : Factor w/ 4 levels "Female","Joint",..: 3 1 1 3 1 2 3 2 1 2 ...
##  $ action_taken         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ loan_amount          : num  105000 245000 245000 135000 245000 125000 455000 185000 195000 615000 ...
##  $ interest_rate        : num  NA 2.88 3.88 5.12 4.38 ...
##  $ property_value       : num  465000 375000 525000 225000 445000 ...
##  $ income               : num  154 77 118 0 78 75 101 181 79 162 ...
##  $ applicant_ethnicity.1: int  3 2 2 2 2 1 1 2 2 2 ...
##  $ applicant_race.1     : int  6 3 5 5 3 5 5 5 5 5 ...
##  $ applicant_sex        : int  1 2 2 1 2 1 1 2 2 1 ...
##  $ applicant_age        : Factor w/ 8 levels "<25",">74","25-34",..: 3 3 4 5 4 3 5 6 7 4 ...
##  $ Black                : num  0 1 0 0 1 0 0 0 0 0 ...
##  $ AIAN                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NHPI                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Asian                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Joint                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ N_A                  : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ two_or               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ approval             : Factor w/ 2 levels "Approved","Denied": 1 1 1 1 1 1 1 1 1 1 ...
hmda_downnum<-hmda_down[c(4:11, 13:17, 19)]
hmda_downnum$approved<- ifelse(hmda_downnum$action_taken=="1", 1, 0)
hmda_downnum$denied<- ifelse(hmda_downnum$action_taken=="3", 1, 0)

str(hmda_downnum)
## 'data.frame':    11844 obs. of  16 variables:
##  $ action_taken         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ loan_amount          : num  105000 245000 245000 135000 245000 125000 455000 185000 195000 615000 ...
##  $ interest_rate        : num  NA 2.88 3.88 5.12 4.38 ...
##  $ property_value       : num  465000 375000 525000 225000 445000 ...
##  $ income               : num  154 77 118 0 78 75 101 181 79 162 ...
##  $ applicant_ethnicity.1: int  3 2 2 2 2 1 1 2 2 2 ...
##  $ applicant_race.1     : int  6 3 5 5 3 5 5 5 5 5 ...
##  $ applicant_sex        : int  1 2 2 1 2 1 1 2 2 1 ...
##  $ Black                : num  0 1 0 0 1 0 0 0 0 0 ...
##  $ AIAN                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NHPI                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Asian                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Joint                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ two_or               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ approved             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ denied               : num  0 0 0 0 0 0 0 0 0 0 ...
cor_loans<-cor(hmda_downnum, use="complete.obs")
xkabledply(cor_loans)
Table
action_taken loan_amount interest_rate property_value income applicant_ethnicity.1 applicant_race.1 applicant_sex Black AIAN NHPI Asian Joint two_or approved denied
action_taken 1.0000 -0.2425 0.3537 -0.0972 -0.1224 0.0067 0.0055 0.0261 0.1208 0.0348 0.0383 0.0400 0.0033 0.0412 -1.0000 1.0000
loan_amount -0.2425 1.0000 -0.2344 0.4319 0.3160 0.0050 0.0214 -0.0522 -0.0141 -0.0197 -0.0104 0.0522 0.0362 -0.0108 0.2425 -0.2425
interest_rate 0.3537 -0.2344 1.0000 -0.0907 -0.1297 0.0033 -0.0137 0.0424 0.0556 -0.0072 0.0172 -0.0159 -0.0203 0.0228 -0.3537 0.3537
property_value -0.0972 0.4319 -0.0907 1.0000 0.3938 0.0111 0.0241 -0.0021 -0.0330 -0.0351 0.0037 0.0955 0.0183 -0.0238 0.0972 -0.0972
income -0.1224 0.3160 -0.1297 0.3938 1.0000 0.0285 0.0369 -0.1041 -0.0294 -0.0211 -0.0109 0.0588 0.1286 -0.0045 0.1224 -0.1224
applicant_ethnicity.1 0.0067 0.0050 0.0033 0.0111 0.0285 1.0000 0.0521 0.0656 -0.0172 -0.0035 0.0034 0.0288 0.0015 -0.0016 -0.0067 0.0067
applicant_race.1 0.0055 0.0214 -0.0137 0.0241 0.0369 0.0521 1.0000 0.0158 -0.1625 -0.1111 0.3513 0.0666 -0.0189 -0.0607 -0.0055 0.0055
applicant_sex 0.0261 -0.0522 0.0424 -0.0021 -0.1041 0.0656 0.0158 1.0000 -0.0002 -0.0248 -0.0047 -0.0407 -0.0717 -0.0087 -0.0261 0.0261
Black 0.1208 -0.0141 0.0556 -0.0330 -0.0294 -0.0172 -0.1625 -0.0002 1.0000 -0.0332 -0.0321 -0.1168 -0.0678 -0.0246 -0.1208 0.1208
AIAN 0.0348 -0.0197 -0.0072 -0.0351 -0.0211 -0.0035 -0.1111 -0.0248 -0.0332 1.0000 -0.0114 -0.0416 -0.0241 -0.0087 -0.0348 0.0348
NHPI 0.0383 -0.0104 0.0172 0.0037 -0.0109 0.0034 0.3513 -0.0047 -0.0321 -0.0114 1.0000 -0.0402 -0.0233 -0.0084 -0.0383 0.0383
Asian 0.0400 0.0522 -0.0159 0.0955 0.0588 0.0288 0.0666 -0.0407 -0.1168 -0.0416 -0.0402 1.0000 -0.0849 -0.0308 -0.0400 0.0400
Joint 0.0033 0.0362 -0.0203 0.0183 0.1286 0.0015 -0.0189 -0.0717 -0.0678 -0.0241 -0.0233 -0.0849 1.0000 -0.0178 -0.0033 0.0033
two_or 0.0412 -0.0108 0.0228 -0.0238 -0.0045 -0.0016 -0.0607 -0.0087 -0.0246 -0.0087 -0.0084 -0.0308 -0.0178 1.0000 -0.0412 0.0412
approved -1.0000 0.2425 -0.3537 0.0972 0.1224 -0.0067 -0.0055 -0.0261 -0.1208 -0.0348 -0.0383 -0.0400 -0.0033 -0.0412 1.0000 -1.0000
denied 1.0000 -0.2425 0.3537 -0.0972 -0.1224 0.0067 0.0055 0.0261 0.1208 0.0348 0.0383 0.0400 0.0033 0.0412 -1.0000 1.0000
loadPkg("corrplot")
corrplot(cor_loans)

Conducting Chi-squared Test on the downsampled dataset (hmda_down)

P-Value for contable1 is lower than .05 therefore we reject the null hypothesis. action_taken and derived_race are not independent of each other

contable1= table(hmda_down$derived_race, hmda_down$action_taken)
xkabledply(contable1, title="Contingency table for Loan Approval and Race")
Contingency table for Loan Approval and Race
1 3
2 or more minority races 16 52
American Indian or Alaska Native 42 88
Asian 691 824
Black or African American 287 663
Joint 274 287
Native Hawaiian or Other Pacific Islander 46 85
Race Not Available 822 1008
White 3744 2915
chitest1 = chisq.test(contable1)
chitest1
## 
##  Pearson's Chi-squared test
## 
## data:  contable1
## X-squared = 330, df = 7, p-value <2e-16
contable2= table(hmda_down$derived_ethnicity, hmda_down$action_taken)
xkabledply(contable2, title="Contingency table for Loan Approval and Ethnicity")
Contingency table for Loan Approval and Ethnicity
1 3
Ethnicity Not Available 726 830
Hispanic or Latino 1050 1291
Joint 341 285
Not Hispanic or Latino 3805 3516
chitest2 = chisq.test(contable2)
chitest2
## 
##  Pearson's Chi-squared test
## 
## data:  contable2
## X-squared = 48, df = 3, p-value = 2e-10
contable3= table(hmda_down$derived_sex, hmda_down$action_taken)
xkabledply(contable3, title="Contingency table for Loan Approval and Gender")
Contingency table for Loan Approval and Gender
1 3
Female 1202 1346
Joint 2682 2065
Male 1757 2161
Sex Not Available 281 350
chitest3 = chisq.test(contable3)
chitest3
## 
##  Pearson's Chi-squared test
## 
## data:  contable3
## X-squared = 138, df = 3, p-value <2e-16
contable4= table(hmda_down$applicant_age, hmda_down$action_taken)
xkabledply(contable4, title="Contingency table for Loan Approval and Age")
Contingency table for Loan Approval and Age
1 3
<25 69 58
>74 441 373
25-34 1046 823
35-44 1442 1417
45-54 1241 1382
55-64 927 1089
65-74 756 779
8888 0 1
chitest4 = chisq.test(contable4)
chitest4
## 
##  Pearson's Chi-squared test
## 
## data:  contable4
## X-squared = 55, df = 7, p-value = 1e-09

Full linear model for action_taken

full_model <- lm(action_taken~., data = loans)
summary(full_model)
## 
## Call:
## lm(formula = action_taken ~ ., data = loans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6631 -0.4953 -0.2729  0.0613  2.1640 
## 
## Coefficients: (7 not defined because of singularities)
##                                                        Estimate Std. Error
## (Intercept)                                            9.77e-01   1.05e-01
## derived_ethnicityHispanic or Latino                    1.04e-01   2.49e-02
## derived_ethnicityJoint                                 1.08e-01   3.27e-02
## derived_ethnicityNot Hispanic or Latino                1.28e-02   2.46e-02
## derived_raceAmerican Indian or Alaska Native          -2.46e-01   9.26e-02
## derived_raceAsian                                     -3.36e-01   7.74e-02
## derived_raceBlack or African American                 -1.33e-01   7.84e-02
## derived_raceJoint                                     -3.56e-01   8.00e-02
## derived_raceNative Hawaiian or Other Pacific Islander -2.03e-01   9.67e-02
## derived_raceRace Not Available                        -4.13e-01   7.90e-02
## derived_raceWhite                                     -5.55e-01   7.63e-02
## derived_sexJoint                                       6.26e-03   1.98e-02
## derived_sexMale                                        1.09e-01   2.29e-02
## derived_sexSex Not Available                           1.30e-01   3.54e-02
## loan_amount                                           -8.11e-07   3.25e-08
## interest_rate                                          2.39e-01   7.67e-03
## property_value                                         2.05e-07   3.10e-08
## income                                                -8.06e-04   1.14e-04
## applicant_ethnicity.1                                  3.22e-03   3.47e-03
## applicant_race.1                                       4.28e-03   1.49e-03
## applicant_sex                                          1.29e-02   1.76e-02
## applicant_age>74                                      -1.42e-01   5.04e-02
## applicant_age25-34                                     3.76e-02   4.67e-02
## applicant_age35-44                                     1.05e-01   4.65e-02
## applicant_age45-54                                     1.18e-01   4.68e-02
## applicant_age55-64                                     8.29e-02   4.72e-02
## applicant_age65-74                                    -8.71e-03   4.80e-02
## Black                                                        NA         NA
## AIAN                                                         NA         NA
## NHPI                                                         NA         NA
## Asian                                                        NA         NA
## Joint                                                        NA         NA
## N_A                                                          NA         NA
## two_or                                                       NA         NA
##                                                       t value Pr(>|t|)    
## (Intercept)                                              9.32  < 2e-16 ***
## derived_ethnicityHispanic or Latino                      4.18  3.0e-05 ***
## derived_ethnicityJoint                                   3.29  0.00101 ** 
## derived_ethnicityNot Hispanic or Latino                  0.52  0.60210    
## derived_raceAmerican Indian or Alaska Native            -2.66  0.00784 ** 
## derived_raceAsian                                       -4.33  1.5e-05 ***
## derived_raceBlack or African American                   -1.70  0.09002 .  
## derived_raceJoint                                       -4.45  8.7e-06 ***
## derived_raceNative Hawaiian or Other Pacific Islander   -2.10  0.03599 *  
## derived_raceRace Not Available                          -5.23  1.7e-07 ***
## derived_raceWhite                                       -7.27  3.7e-13 ***
## derived_sexJoint                                         0.32  0.75170    
## derived_sexMale                                          4.74  2.1e-06 ***
## derived_sexSex Not Available                             3.68  0.00023 ***
## loan_amount                                            -24.97  < 2e-16 ***
## interest_rate                                           31.10  < 2e-16 ***
## property_value                                           6.60  4.2e-11 ***
## income                                                  -7.06  1.7e-12 ***
## applicant_ethnicity.1                                    0.93  0.35368    
## applicant_race.1                                         2.87  0.00415 ** 
## applicant_sex                                            0.73  0.46523    
## applicant_age>74                                        -2.82  0.00478 ** 
## applicant_age25-34                                       0.81  0.42031    
## applicant_age35-44                                       2.25  0.02437 *  
## applicant_age45-54                                       2.52  0.01161 *  
## applicant_age55-64                                       1.76  0.07914 .  
## applicant_age65-74                                      -0.18  0.85611    
## Black                                                      NA       NA    
## AIAN                                                       NA       NA    
## NHPI                                                       NA       NA    
## Asian                                                      NA       NA    
## Joint                                                      NA       NA    
## N_A                                                        NA       NA    
## two_or                                                     NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.788 on 23875 degrees of freedom
##   (4793 observations deleted due to missingness)
## Multiple R-squared:  0.12,   Adjusted R-squared:  0.119 
## F-statistic:  125 on 26 and 23875 DF,  p-value: <2e-16

Removing specific race fields due to NAs, running model again

loans_for_modelling <- loans[1:12]
full_model_2 <- lm(action_taken~., data = loans_for_modelling)
summary(full_model_2)
## 
## Call:
## lm(formula = action_taken ~ ., data = loans_for_modelling)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6631 -0.4953 -0.2729  0.0613  2.1640 
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                            9.77e-01   1.05e-01
## derived_ethnicityHispanic or Latino                    1.04e-01   2.49e-02
## derived_ethnicityJoint                                 1.08e-01   3.27e-02
## derived_ethnicityNot Hispanic or Latino                1.28e-02   2.46e-02
## derived_raceAmerican Indian or Alaska Native          -2.46e-01   9.26e-02
## derived_raceAsian                                     -3.36e-01   7.74e-02
## derived_raceBlack or African American                 -1.33e-01   7.84e-02
## derived_raceJoint                                     -3.56e-01   8.00e-02
## derived_raceNative Hawaiian or Other Pacific Islander -2.03e-01   9.67e-02
## derived_raceRace Not Available                        -4.13e-01   7.90e-02
## derived_raceWhite                                     -5.55e-01   7.63e-02
## derived_sexJoint                                       6.26e-03   1.98e-02
## derived_sexMale                                        1.09e-01   2.29e-02
## derived_sexSex Not Available                           1.30e-01   3.54e-02
## loan_amount                                           -8.11e-07   3.25e-08
## interest_rate                                          2.39e-01   7.67e-03
## property_value                                         2.05e-07   3.10e-08
## income                                                -8.06e-04   1.14e-04
## applicant_ethnicity.1                                  3.22e-03   3.47e-03
## applicant_race.1                                       4.28e-03   1.49e-03
## applicant_sex                                          1.29e-02   1.76e-02
## applicant_age>74                                      -1.42e-01   5.04e-02
## applicant_age25-34                                     3.76e-02   4.67e-02
## applicant_age35-44                                     1.05e-01   4.65e-02
## applicant_age45-54                                     1.18e-01   4.68e-02
## applicant_age55-64                                     8.29e-02   4.72e-02
## applicant_age65-74                                    -8.71e-03   4.80e-02
##                                                       t value Pr(>|t|)    
## (Intercept)                                              9.32  < 2e-16 ***
## derived_ethnicityHispanic or Latino                      4.18  3.0e-05 ***
## derived_ethnicityJoint                                   3.29  0.00101 ** 
## derived_ethnicityNot Hispanic or Latino                  0.52  0.60210    
## derived_raceAmerican Indian or Alaska Native            -2.66  0.00784 ** 
## derived_raceAsian                                       -4.33  1.5e-05 ***
## derived_raceBlack or African American                   -1.70  0.09002 .  
## derived_raceJoint                                       -4.45  8.7e-06 ***
## derived_raceNative Hawaiian or Other Pacific Islander   -2.10  0.03599 *  
## derived_raceRace Not Available                          -5.23  1.7e-07 ***
## derived_raceWhite                                       -7.27  3.7e-13 ***
## derived_sexJoint                                         0.32  0.75170    
## derived_sexMale                                          4.74  2.1e-06 ***
## derived_sexSex Not Available                             3.68  0.00023 ***
## loan_amount                                            -24.97  < 2e-16 ***
## interest_rate                                           31.10  < 2e-16 ***
## property_value                                           6.60  4.2e-11 ***
## income                                                  -7.06  1.7e-12 ***
## applicant_ethnicity.1                                    0.93  0.35368    
## applicant_race.1                                         2.87  0.00415 ** 
## applicant_sex                                            0.73  0.46523    
## applicant_age>74                                        -2.82  0.00478 ** 
## applicant_age25-34                                       0.81  0.42031    
## applicant_age35-44                                       2.25  0.02437 *  
## applicant_age45-54                                       2.52  0.01161 *  
## applicant_age55-64                                       1.76  0.07914 .  
## applicant_age65-74                                      -0.18  0.85611    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.788 on 23875 degrees of freedom
##   (4793 observations deleted due to missingness)
## Multiple R-squared:  0.12,   Adjusted R-squared:  0.119 
## F-statistic:  125 on 26 and 23875 DF,  p-value: <2e-16

#{r} #pcr.fit=pcr(action_taken~.,data=loans_for_modelling,scale=FALSE,validation ="CV") #ezids::xkabledplyhead(loans_for_modelling) #summary(pcr.fit) #

str(loans_for_modelling)
## 'data.frame':    28695 obs. of  12 variables:
##  $ derived_ethnicity    : Factor w/ 4 levels "Ethnicity Not Available",..: 4 2 2 2 2 2 1 2 2 2 ...
##  $ derived_race         : Factor w/ 8 levels "2 or more minority races",..: 8 7 8 8 8 8 7 8 8 8 ...
##  $ derived_sex          : Factor w/ 4 levels "Female","Joint",..: 3 2 3 2 1 3 1 3 3 3 ...
##  $ action_taken         : int  1 1 1 3 1 1 1 1 3 1 ...
##  $ loan_amount          : num  NA 175000 315000 115000 165000 265000 445000 335000 315000 285000 ...
##  $ interest_rate        : num  5.25 NA NA 4.46 NA ...
##  $ property_value       : num  NA 235000 415000 145000 435000 415000 705000 425000 865000 405000 ...
##  $ income               : num  NA 72 146 42 64 48 101 125 71 78 ...
##  $ applicant_ethnicity.1: int  2 1 1 1 1 1 3 1 1 1 ...
##  $ applicant_race.1     : int  5 6 5 5 5 5 6 5 5 5 ...
##  $ applicant_sex        : int  1 1 1 1 2 1 2 1 1 1 ...
##  $ applicant_age        : Factor w/ 8 levels "<25",">74","25-34",..: 4 4 4 3 4 4 4 5 5 4 ...

Changing action_taken to factor for modelling purposes

#{r} #loans_for_modelling = loans_for_modelling #loans_for_modelling$action_taken = #factor(loans_for_modelling$action_taken) #

Linear models for action_taken

action_taken_fit1 <- lm(action_taken ~ derived_race, data = loans_for_modelling, family = "binomial")
summary(action_taken_fit1)
## 
## Call:
## lm(formula = action_taken ~ derived_race, data = loans_for_modelling, 
##     family = "binomial")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.819 -0.469 -0.343 -0.343  1.657 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                             1.8189     0.0712
## derived_raceAmerican Indian or Alaska Native           -0.1367     0.0870
## derived_raceAsian                                      -0.3475     0.0725
## derived_raceBlack or African American                  -0.1228     0.0735
## derived_raceJoint                                      -0.3934     0.0745
## derived_raceNative Hawaiian or Other Pacific Islander  -0.1193     0.0879
## derived_raceRace Not Available                         -0.3495     0.0723
## derived_raceWhite                                      -0.4764     0.0715
##                                                       t value Pr(>|t|)    
## (Intercept)                                             25.54  < 2e-16 ***
## derived_raceAmerican Indian or Alaska Native            -1.57    0.116    
## derived_raceAsian                                       -4.79  1.6e-06 ***
## derived_raceBlack or African American                   -1.67    0.095 .  
## derived_raceJoint                                       -5.28  1.3e-07 ***
## derived_raceNative Hawaiian or Other Pacific Islander   -1.36    0.175    
## derived_raceRace Not Available                          -4.84  1.3e-06 ***
## derived_raceWhite                                       -6.66  2.7e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.803 on 28687 degrees of freedom
## Multiple R-squared:  0.0172, Adjusted R-squared:  0.0169 
## F-statistic: 71.6 on 7 and 28687 DF,  p-value: <2e-16
xkablevif(action_taken_fit1)
VIFs of the model
derived_raceAmerican Indian or Alaska Native derived_raceAsian derived_raceBlack or African American derived_raceJoint derived_raceNative Hawaiian or Other Pacific Islander derived_raceRace Not Available derived_raceWhite
3 25.1 14.9 11.1 2.89 29.6 54.9
action_taken_fit2 <- lm(action_taken ~ derived_race + derived_ethnicity, data = loans_for_modelling, family = "binomial")
summary(action_taken_fit2)
## 
## Call:
## lm(formula = action_taken ~ derived_race + derived_ethnicity, 
##     data = loans_for_modelling, family = "binomial")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.932 -0.452 -0.310 -0.310  1.690 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                             1.8324     0.0742
## derived_raceAmerican Indian or Alaska Native           -0.1938     0.0871
## derived_raceAsian                                      -0.3391     0.0724
## derived_raceBlack or African American                  -0.1151     0.0734
## derived_raceJoint                                      -0.3905     0.0745
## derived_raceNative Hawaiian or Other Pacific Islander  -0.1338     0.0878
## derived_raceRace Not Available                         -0.3800     0.0739
## derived_raceWhite                                      -0.4956     0.0714
## derived_ethnicityHispanic or Latino                     0.0998     0.0218
## derived_ethnicityJoint                                 -0.0151     0.0292
## derived_ethnicityNot Hispanic or Latino                -0.0265     0.0217
##                                                       t value Pr(>|t|)    
## (Intercept)                                             24.68  < 2e-16 ***
## derived_raceAmerican Indian or Alaska Native            -2.23    0.026 *  
## derived_raceAsian                                       -4.68  2.8e-06 ***
## derived_raceBlack or African American                   -1.57    0.117    
## derived_raceJoint                                       -5.24  1.6e-07 ***
## derived_raceNative Hawaiian or Other Pacific Islander   -1.52    0.127    
## derived_raceRace Not Available                          -5.14  2.7e-07 ***
## derived_raceWhite                                       -6.94  4.0e-12 ***
## derived_ethnicityHispanic or Latino                      4.58  4.8e-06 ***
## derived_ethnicityJoint                                  -0.52    0.606    
## derived_ethnicityNot Hispanic or Latino                 -1.22    0.223    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.801 on 28684 degrees of freedom
## Multiple R-squared:  0.0205, Adjusted R-squared:  0.0201 
## F-statistic: 59.9 on 10 and 28684 DF,  p-value: <2e-16
action_taken_fit3 <- lm(action_taken ~ derived_race + derived_ethnicity + derived_sex, data = loans_for_modelling, family = "binomial")
summary(action_taken_fit3)
## 
## Call:
## lm(formula = action_taken ~ derived_race + derived_ethnicity + 
##     derived_sex, data = loans_for_modelling, family = "binomial")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.967 -0.460 -0.348 -0.243  1.757 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                            1.81615    0.07483
## derived_raceAmerican Indian or Alaska Native          -0.19405    0.08684
## derived_raceAsian                                     -0.31991    0.07220
## derived_raceBlack or African American                 -0.10896    0.07324
## derived_raceJoint                                     -0.31267    0.07460
## derived_raceNative Hawaiian or Other Pacific Islander -0.11520    0.08752
## derived_raceRace Not Available                        -0.36723    0.07377
## derived_raceWhite                                     -0.46918    0.07124
## derived_ethnicityHispanic or Latino                    0.11279    0.02292
## derived_ethnicityJoint                                 0.05885    0.03017
## derived_ethnicityNot Hispanic or Latino               -0.00361    0.02269
## derived_sexJoint                                      -0.10063    0.01303
## derived_sexMale                                        0.03765    0.01338
## derived_sexSex Not Available                           0.06637    0.02840
##                                                       t value Pr(>|t|)    
## (Intercept)                                             24.27  < 2e-16 ***
## derived_raceAmerican Indian or Alaska Native            -2.23   0.0255 *  
## derived_raceAsian                                       -4.43  9.4e-06 ***
## derived_raceBlack or African American                   -1.49   0.1368    
## derived_raceJoint                                       -4.19  2.8e-05 ***
## derived_raceNative Hawaiian or Other Pacific Islander   -1.32   0.1881    
## derived_raceRace Not Available                          -4.98  6.5e-07 ***
## derived_raceWhite                                       -6.59  4.6e-11 ***
## derived_ethnicityHispanic or Latino                      4.92  8.7e-07 ***
## derived_ethnicityJoint                                   1.95   0.0511 .  
## derived_ethnicityNot Hispanic or Latino                 -0.16   0.8736    
## derived_sexJoint                                        -7.72  1.2e-14 ***
## derived_sexMale                                          2.81   0.0049 ** 
## derived_sexSex Not Available                             2.34   0.0194 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.799 on 28681 degrees of freedom
## Multiple R-squared:  0.0261, Adjusted R-squared:  0.0256 
## F-statistic: 59.1 on 13 and 28681 DF,  p-value: <2e-16
xkablevif(action_taken_fit3)
VIFs of the model
derived_ethnicityHispanic or Latino derived_ethnicityJoint derived_ethnicityNot Hispanic or Latino derived_raceAmerican Indian or Alaska Native derived_raceAsian derived_raceBlack or African American derived_raceJoint derived_raceNative Hawaiian or Other Pacific Islander derived_raceRace Not Available derived_raceWhite derived_sexJoint derived_sexMale derived_sexSex Not Available
3.02 25.1 14.9 11.2 2.89 31.1 55 3.63 2.04 5.4 1.87 1.73 1.64
action_taken_fit4 <- lm(action_taken ~ derived_race + derived_ethnicity + derived_sex + applicant_age, data = loans_for_modelling, family = "binomial")
summary(action_taken_fit4)
## 
## Call:
## lm(formula = action_taken ~ derived_race + derived_ethnicity + 
##     derived_sex + applicant_age, data = loans_for_modelling, 
##     family = "binomial")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.038 -0.443 -0.351 -0.217  1.838 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                            1.76283    0.08712
## derived_raceAmerican Indian or Alaska Native          -0.20408    0.08674
## derived_raceAsian                                     -0.32347    0.07211
## derived_raceBlack or African American                 -0.12462    0.07316
## derived_raceJoint                                     -0.31976    0.07451
## derived_raceNative Hawaiian or Other Pacific Islander -0.12763    0.08742
## derived_raceRace Not Available                        -0.37216    0.07369
## derived_raceWhite                                     -0.48170    0.07116
## derived_ethnicityHispanic or Latino                    0.12496    0.02296
## derived_ethnicityJoint                                 0.07118    0.03018
## derived_ethnicityNot Hispanic or Latino               -0.00347    0.02266
## derived_sexJoint                                      -0.09519    0.01309
## derived_sexMale                                        0.04860    0.01353
## derived_sexSex Not Available                           0.07155    0.02845
## applicant_age>74                                       0.04600    0.04885
## applicant_age25-34                                    -0.02077    0.04675
## applicant_age35-44                                     0.03461    0.04629
## applicant_age45-54                                     0.07294    0.04641
## applicant_age55-64                                     0.10195    0.04680
## applicant_age65-74                                     0.10573    0.04732
## applicant_age8888                                      1.53778    0.79949
##                                                       t value Pr(>|t|)    
## (Intercept)                                             20.24  < 2e-16 ***
## derived_raceAmerican Indian or Alaska Native            -2.35  0.01864 *  
## derived_raceAsian                                       -4.49  7.3e-06 ***
## derived_raceBlack or African American                   -1.70  0.08854 .  
## derived_raceJoint                                       -4.29  1.8e-05 ***
## derived_raceNative Hawaiian or Other Pacific Islander   -1.46  0.14431    
## derived_raceRace Not Available                          -5.05  4.4e-07 ***
## derived_raceWhite                                       -6.77  1.3e-11 ***
## derived_ethnicityHispanic or Latino                      5.44  5.3e-08 ***
## derived_ethnicityJoint                                   2.36  0.01835 *  
## derived_ethnicityNot Hispanic or Latino                 -0.15  0.87842    
## derived_sexJoint                                        -7.27  3.7e-13 ***
## derived_sexMale                                          3.59  0.00033 ***
## derived_sexSex Not Available                             2.52  0.01190 *  
## applicant_age>74                                         0.94  0.34641    
## applicant_age25-34                                      -0.44  0.65686    
## applicant_age35-44                                       0.75  0.45472    
## applicant_age45-54                                       1.57  0.11604    
## applicant_age55-64                                       2.18  0.02939 *  
## applicant_age65-74                                       2.23  0.02548 *  
## applicant_age8888                                        1.92  0.05443 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.798 on 28674 degrees of freedom
## Multiple R-squared:  0.0289, Adjusted R-squared:  0.0282 
## F-statistic: 42.7 on 20 and 28674 DF,  p-value: <2e-16
xkablevif(action_taken_fit4)
VIFs of the model
applicant_age>74 applicant_age25-34 applicant_age35-44 applicant_age45-54 applicant_age55-64 applicant_age65-74 applicant_age8888 derived_ethnicityHispanic or Latino derived_ethnicityJoint derived_ethnicityNot Hispanic or Latino derived_raceAmerican Indian or Alaska Native derived_raceAsian derived_raceBlack or African American derived_raceJoint derived_raceNative Hawaiian or Other Pacific Islander derived_raceRace Not Available derived_raceWhite derived_sexJoint derived_sexMale derived_sexSex Not Available
3.02 25.1 15 11.2 2.89 31.1 55.1 3.65 2.05 5.41 1.9 1.77 1.65 6.96 13.5 17.9 16.6 13.7 11 1

The multiple R^2s increase with each successive model, indicating slight improvements from action_taken_fit1 through action_taken_fit4.

ANOVA comparison of models

anova(action_taken_fit1,action_taken_fit2,action_taken_fit3,action_taken_fit4) -> anovaRes
str(anovaRes)
## Classes 'anova' and 'data.frame':    4 obs. of  6 variables:
##  $ Res.Df   : num  28687 28684 28681 28674
##  $ RSS      : num  18477 18415 18309 18256
##  $ Df       : num  NA 3 3 7
##  $ Sum of Sq: num  NA 61.9 105.7 53.5
##  $ F        : num  NA 32.4 55.3 12
##  $ Pr(>F)   : num  NA 6.41e-21 1.15e-35 2.23e-15
##  - attr(*, "heading")= chr [1:2] "Analysis of Variance Table\n" "Model 1: action_taken ~ derived_race\nModel 2: action_taken ~ derived_race + derived_ethnicity\nModel 3: action"| __truncated__
xkabledply(anovaRes, title = "ANOVA comparison between the models")
ANOVA comparison between the models
Res.Df RSS Df Sum of Sq F Pr(>F)
28687 18477 NA NA NA NA
28684 18415 3 61.9 32.4 0
28681 18309 3 105.7 55.3 0
28674 18256 7 53.5 12.0 0

#SideNotes on ANOVA Testing These are some constructs of my ANOVA test I left them here just incase I need to revist them, these might not be perfect because I ran multiple different variants, I assume it will not be used ultimately and if we need too we can run chi-squared on the balanced dataset.

Conducting ANOVA Test on action_taken variable and income, saved test as anova

anova= aov(income ~ approval, data=hmda) str(anova)

#Plotting ANOVA TEST

loadPkg(“ggplot2”) ggplot(hmda, aes(x=approval, y=income)) + geom_boxplot( colour=c(“#ff0000” ,“#11cc11”)) + labs(title=“Income difference between Approved and Denied Apps”, x=“Approval Status”, y = “Income”)

#plot(income ~ action_taken, data=hmda) anova= aov(action_taken ~ Black, data=hmda) # anovaRes # this does not give the easy-to-read result of the aov analysis names(anova) # summary(anovaRes) xkabledply(anova) # same exact result with or without re-ordering.